knitr::opts_chunk$set(echo = FALSE)
library(ggplot2)
library(phytools)
## Loading required package: ape
## Loading required package: maps
library(ape)
library(readr)
library(svglite)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
## 
##     ozone
library(RColorBrewer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ stringr   1.5.0
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::desc()      masks plyr::desc()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::map()       masks maps::map()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ dplyr::where()     masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:plyr':
## 
##     mutate
## 
## The following object is masked from 'package:ape':
## 
##     rotate
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following objects are masked from 'package:plyr':
## 
##     desc, mutate
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(AMR) #g.test
## Registered S3 method overwritten by 'AMR':
##   method   from  
##   plot.sir igraph
library(car)#multivatiate regression model
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(jtools)#multivatiate regression model
library(interactions)#multivatiate regression model
library(tidyverse)#logistic regression
library(caret)#logistic regression
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.2.3
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(epitools) #odd ratio

Association of RT-qPCR RV Ct value with symptoms, age and Sex of the individuals:

PCR_data_RV <- read_csv("PCR_Ct_symp_age_20240130.csv") 
## Rows: 1226 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Sequence_or_SampleID, AgeGroup, Symptoms, Sex, RV_Ct_value
## dbl  (1): Age
## date (1): CollectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PCR_data_RV$CollectionMonth <- as.factor(substr(PCR_data_RV$CollectionDate,1,7))
PCR_data_RV$Sex <- as.factor(PCR_data_RV$Sex)
PCR_data_RV$Symptoms <- as.factor(PCR_data_RV$Symptoms)
PCR_data_RV$AgeGroup <- as.factor(PCR_data_RV$AgeGroup)
PCR_data_RV$Age <- as.numeric(PCR_data_RV$Age)
PCR_data_RV$RV_Ct_value <- as.numeric(PCR_data_RV$RV_Ct_value)
## Warning: NAs introduced by coercion
#analysis Ct versus age:
PCR_data_RV %>% group_by(AgeGroup) %>% get_summary_stats(RV_Ct_value, type="mean_sd")
## # A tibble: 4 × 5
##   AgeGroup variable        n  mean    sd
##   <fct>    <fct>       <dbl> <dbl> <dbl>
## 1 18-65    RV_Ct_value   719  27.5  5.98
## 2 5-17     RV_Ct_value   217  25.6  6.20
## 3 less5    RV_Ct_value   209  26.9  5.37
## 4 more65   RV_Ct_value    80  28.5  5.15
#plotting Ct vs age groups
ordered_age_groups  = factor(PCR_data_RV$AgeGroup, levels=c("less5", "5-17", "18-65", "more65"))
A1 <- ggplot(data = PCR_data_RV, mapping = aes(x=ordered_age_groups, y=RV_Ct_value, fill=ordered_age_groups)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + ylab("Ct value") + xlab("Age groups")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Kruskal test Ct vs age:
res.age.kruskal <- kruskal_test(PCR_data_RV, RV_Ct_value ~ AgeGroup)
res.age.kruskal
## # A tibble: 1 × 6
##   .y.             n statistic    df         p method        
## * <chr>       <int>     <dbl> <int>     <dbl> <chr>         
## 1 RV_Ct_value  1226      23.3     3 0.0000346 Kruskal-Wallis
#calculate effect size
res.age.effsize <- PCR_data_RV %>% kruskal_effsize(RV_Ct_value ~ AgeGroup)
res.age.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.             n effsize method  magnitude
## * <chr>       <int>   <dbl> <chr>   <ord>    
## 1 RV_Ct_value  1226  0.0166 eta2[H] small
#Wilcoxon’s test - pairwise comparisons between group:
pwc.Ct.age <- ungroup(PCR_data_RV) %>% wilcox_test(RV_Ct_value ~ AgeGroup, p.adjust.method = "bonferroni") 
pwc.Ct.age
## # A tibble: 6 × 9
##   .y.         group1 group2    n1    n2 statistic         p   p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl>     <dbl>   <dbl> <chr>       
## 1 RV_Ct_value 18-65  5-17     719   217    92644. 0.0000276 1.66e-4 ***         
## 2 RV_Ct_value 18-65  less5    719   209    79317  0.22      1   e+0 ns          
## 3 RV_Ct_value 18-65  more65   719    80    25800. 0.131     7.86e-1 ns          
## 4 RV_Ct_value 5-17   less5    217   209    19422  0.01      6.2 e-2 ns          
## 5 RV_Ct_value 5-17   more65   217    80     6101  0.000086  5.16e-4 ***         
## 6 RV_Ct_value less5  more65   209    80     6903  0.022     1.31e-1 ns
#plotting results:
pwc.Ct.age <- pwc.Ct.age %>% add_xy_position(x = "AgeGroup")
ggboxplot(PCR_data_RV, x = "AgeGroup", y = "RV_Ct_value") +
    stat_pvalue_manual(pwc.Ct.age, hide.ns = TRUE) +
    labs(
        subtitle = get_test_label(res.age.kruskal, detailed = TRUE),
        caption = get_pwc_label(pwc.Ct.age)
    )
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

#analysis Ct versus symptomatic:
#summary of the dataframe:
PCR_data_RV.NA <- na.omit(PCR_data_RV)
PCR_data_RV.NA$Symptoms <- factor(PCR_data_RV.NA$Symptoms, levels=c("Symptomatic", "Asymptomatic"))

ggboxplot(PCR_data_RV.NA, x="Symptoms", y="RV_Ct_value")

B1 <- ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=RV_Ct_value, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + theme(axis.title.x=element_blank()) + ylab("Ct value")

#Kruskal test
res.sym.kruskal <- kruskal_test(PCR_data_RV.NA, RV_Ct_value ~ Symptoms)
res.sym.kruskal
## # A tibble: 1 × 6
##   .y.             n statistic    df        p method        
## * <chr>       <int>     <dbl> <int>    <dbl> <chr>         
## 1 RV_Ct_value   974      46.2     1 1.07e-11 Kruskal-Wallis
#calculate effect size
res.sym.effsize <- PCR_data_RV.NA %>% kruskal_effsize(RV_Ct_value ~ Symptoms)
res.sym.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.             n effsize method  magnitude
## * <chr>       <int>   <dbl> <chr>   <ord>    
## 1 RV_Ct_value   974  0.0465 eta2[H] small
#Wilcoxon’s test to calculate pairwise comparisons between group levels with corrections for multiple testing:
pwc.ct <- PCR_data_RV.NA %>% 
  wilcox_test(RV_Ct_value ~ Symptoms, p.adjust.method = "bonferroni") 
pwc.ct
## # A tibble: 1 × 7
##   .y.         group1      group2          n1    n2 statistic        p
## * <chr>       <chr>       <chr>        <int> <int>     <dbl>    <dbl>
## 1 RV_Ct_value Symptomatic Asymptomatic   662   312    75430. 1.07e-11
#visualization: box plots with p-values:
pwc.ct <- pwc.ct %>% add_xy_position(x = "Symptoms")
ggboxplot(PCR_data_RV.NA, x = "Symptoms", y = "RV_Ct_value") +
  stat_pvalue_manual(pwc.ct, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(res.sym.kruskal, detailed = TRUE),
    caption = get_pwc_label(pwc.ct)
    )

#Age versus symptoms
PCR_data_RV.NA %>% group_by(Symptoms) %>%
          get_summary_stats(Age, type="mean_sd")
## # A tibble: 2 × 5
##   Symptoms     variable     n  mean    sd
##   <fct>        <fct>    <dbl> <dbl> <dbl>
## 1 Symptomatic  Age        662  26.1  19.0
## 2 Asymptomatic Age        312  27.1  20.5
ggboxplot(PCR_data_RV.NA, x="Symptoms", y="Age")

C1 <-ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=Age, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + theme(axis.title.x=element_blank()) + ylab("Age")

#Kruskal test
res.ageSYM.kruskal <- kruskal_test(PCR_data_RV.NA, Age ~ Symptoms)
res.ageSYM.kruskal
## # A tibble: 1 × 6
##   .y.       n statistic    df     p method        
## * <chr> <int>     <dbl> <int> <dbl> <chr>         
## 1 Age     974     0.156     1 0.693 Kruskal-Wallis
#multivariate multiple regression model to study Ct, age and symptomatic
lm1 <- lm(RV_Ct_value~Symptoms+Age, data=PCR_data_RV.NA)
Anova(lm1)
## Anova Table (Type II tests)
## 
## Response: RV_Ct_value
##           Sum Sq  Df F value    Pr(>F)    
## Symptoms    1574   1 47.9635 7.891e-12 ***
## Age          151   1  4.5935   0.03234 *  
## Residuals  31858 971                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm1)

#identify the formula:
summ(lm1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
Observations 974
Dependent variable RV_Ct_value
Type OLS linear regression
F(2,971) 26.649
R² 0.052
Adj. R² 0.050
Est. 2.5% 97.5% t val. p
(Intercept) 25.133 24.483 25.784 75.826 0.000
SymptomsAsymptomatic 2.725 1.953 3.497 6.926 0.000
Age 0.020 0.002 0.039 2.143 0.032
Standard errors: OLS
#plot
interact_plot(lm1, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.1, interval = TRUE, int.type = "confidence", int.width = .95) + ylab("Ct value")
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.

D1 <- interact_plot(lm1, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.1, interval = TRUE, int.type = "confidence", int.width = .95) + ylab("Ct value") + theme(legend.position = "none")
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.
#verify normality of residuals
qqnorm(lm1$residuals)
qqline(lm1$residuals)

#save the plot:
plot_grid(A1, B1, C1, D1, labels = c('A', 'B', 'C', 'D'), align = "h")
## Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

PCR_data_RV$YEAR <- as.factor(substr(PCR_data_RV$CollectionDate,1,4))
PCR_data_RV_2022 <- subset(PCR_data_RV, YEAR=="2022")

#analysis Ct versus age:
PCR_data_RV_2022 %>% group_by(AgeGroup) %>% get_summary_stats(RV_Ct_value, type="mean_sd")
## # A tibble: 4 × 5
##   AgeGroup variable        n  mean    sd
##   <fct>    <fct>       <dbl> <dbl> <dbl>
## 1 18-65    RV_Ct_value   516  29.4  5.47
## 2 5-17     RV_Ct_value    91  30.2  5.34
## 3 less5    RV_Ct_value   141  29.1  4.57
## 4 more65   RV_Ct_value    71  29.3  4.81
#plotting Ct vs age groups
ordered_age_groups_2022  = factor(PCR_data_RV_2022$AgeGroup, levels=c("less5", "5-17", "18-65", "more65"))
A1_2022 <- ggplot(data = PCR_data_RV_2022, mapping = aes(x=ordered_age_groups_2022, y=RV_Ct_value, fill=ordered_age_groups_2022)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + ylab("Ct value") + xlab("Age groups")

#Kruskal test Ct vs age:
res.age.kruskal.2022 <- kruskal_test(PCR_data_RV_2022, RV_Ct_value ~ AgeGroup)
res.age.kruskal.2022
## # A tibble: 1 × 6
##   .y.             n statistic    df     p method        
## * <chr>       <int>     <dbl> <int> <dbl> <chr>         
## 1 RV_Ct_value   819      2.06     3 0.561 Kruskal-Wallis
#calculate effect size
res.age.effsize.2022 <- PCR_data_RV_2022 %>% kruskal_effsize(RV_Ct_value ~ AgeGroup)
res.age.effsize.2022 #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.             n  effsize method  magnitude
## * <chr>       <int>    <dbl> <chr>   <ord>    
## 1 RV_Ct_value   819 -0.00116 eta2[H] small
#Wilcoxon’s test - pairwise comparisons between group:
pwc.Ct.age.2022 <- ungroup(PCR_data_RV_2022) %>% wilcox_test(RV_Ct_value ~ AgeGroup, p.adjust.method = "bonferroni") 
pwc.Ct.age.2022
## # A tibble: 6 × 9
##   .y.         group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 RV_Ct_value 18-65  5-17     516    91    22008  0.341 1     ns          
## 2 RV_Ct_value 18-65  less5    516   141    38006. 0.415 1     ns          
## 3 RV_Ct_value 18-65  more65   516    71    18518  0.882 1     ns          
## 4 RV_Ct_value 5-17   less5     91   141     7174. 0.129 0.774 ns          
## 5 RV_Ct_value 5-17   more65    91    71     3469  0.422 1     ns          
## 6 RV_Ct_value less5  more65   141    71     4814. 0.651 1     ns
#plotting results:
pwc.Ct.age.2022 <- pwc.Ct.age.2022 %>% add_xy_position(x = "AgeGroup")
ggboxplot(PCR_data_RV_2022, x = "AgeGroup", y = "RV_Ct_value") +
    stat_pvalue_manual(pwc.Ct.age.2022, hide.ns = TRUE) +
    labs(
        subtitle = get_test_label(res.age.kruskal.2022, detailed = TRUE),
        caption = get_pwc_label(pwc.Ct.age.2022)
    )

#analysis Ct versus symptomatic:
#summary of the dataframe:
PCR_data_RV.NA_2022 <- na.omit(PCR_data_RV_2022)
PCR_data_RV.NA_2022$Symptoms <- factor(PCR_data_RV.NA_2022$Symptoms, levels=c("Symptomatic", "Asymptomatic"))

ggboxplot(PCR_data_RV.NA_2022, x="Symptoms", y="RV_Ct_value")

B1_2022 <- ggplot(data = PCR_data_RV.NA_2022, mapping = aes(x=Symptoms, y=RV_Ct_value, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + theme(axis.title.x=element_blank()) + ylab("Ct value")

#Kruskal test
res.sym.kruskal_2022 <- kruskal_test(PCR_data_RV.NA_2022, RV_Ct_value ~ Symptoms)
res.sym.kruskal_2022
## # A tibble: 1 × 6
##   .y.             n statistic    df           p method        
## * <chr>       <int>     <dbl> <int>       <dbl> <chr>         
## 1 RV_Ct_value   601      24.9     1 0.000000592 Kruskal-Wallis
#calculate effect size
res.sym.effsize_2022 <- PCR_data_RV.NA_2022 %>% kruskal_effsize(RV_Ct_value ~ Symptoms)
res.sym.effsize_2022 #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.             n effsize method  magnitude
## * <chr>       <int>   <dbl> <chr>   <ord>    
## 1 RV_Ct_value   601  0.0400 eta2[H] small
#Wilcoxon’s test to calculate pairwise comparisons between group levels with corrections for multiple testing:
pwc.ct_2022 <- PCR_data_RV.NA_2022 %>% 
  wilcox_test(RV_Ct_value ~ Symptoms, p.adjust.method = "bonferroni") 
pwc.ct_2022
## # A tibble: 1 × 7
##   .y.         group1      group2          n1    n2 statistic           p
## * <chr>       <chr>       <chr>        <int> <int>     <dbl>       <dbl>
## 1 RV_Ct_value Symptomatic Asymptomatic   376   225     32012 0.000000593
#visualization: box plots with p-values:
pwc.ct_2022 <- pwc.ct_2022 %>% add_xy_position(x = "Symptoms")
ggboxplot(PCR_data_RV.NA_2022, x = "Symptoms", y = "RV_Ct_value") +
  stat_pvalue_manual(pwc.ct_2022, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(res.sym.kruskal_2022, detailed = TRUE),
    caption = get_pwc_label(pwc.ct_2022)
    )

#Age versus symptoms
PCR_data_RV.NA_2022 %>% group_by(Symptoms) %>%
          get_summary_stats(Age, type="mean_sd")
## # A tibble: 2 × 5
##   Symptoms     variable     n  mean    sd
##   <fct>        <fct>    <dbl> <dbl> <dbl>
## 1 Symptomatic  Age        376  28.3  19.6
## 2 Asymptomatic Age        225  28.8  21.6
ggboxplot(PCR_data_RV.NA_2022, x="Symptoms", y="Age")

C1_2022 <-ggplot(data = PCR_data_RV.NA_2022, mapping = aes(x=Symptoms, y=Age, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.2, alpha=0.9) + guides(fill = FALSE) + theme_minimal() + theme(axis.title.x=element_blank()) + ylab("Age")

#Kruskal test
res.ageSYM.kruskal_2022 <- kruskal_test(PCR_data_RV.NA_2022, Age ~ Symptoms)
res.ageSYM.kruskal_2022
## # A tibble: 1 × 6
##   .y.       n statistic    df     p method        
## * <chr> <int>     <dbl> <int> <dbl> <chr>         
## 1 Age     601    0.0306     1 0.861 Kruskal-Wallis
#multivariate multiple regression model to study Ct, age and symptomatic
lm1_2022 <- lm(RV_Ct_value~Symptoms+Age, data=PCR_data_RV.NA_2022)
Anova(lm1_2022)
## Anova Table (Type II tests)
## 
## Response: RV_Ct_value
##            Sum Sq  Df F value    Pr(>F)    
## Symptoms    689.5   1 25.4618 5.995e-07 ***
## Age           7.7   1  0.2829     0.595    
## Residuals 16194.5 598                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm1_2022)

#identify the formula:
summ(lm1_2022, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
Observations 601
Dependent variable RV_Ct_value
Type OLS linear regression
F(2,598) 12.842
R² 0.041
Adj. R² 0.038
Est. 2.5% 97.5% t val. p
(Intercept) 28.503 27.720 29.286 71.479 0.000
SymptomsAsymptomatic 2.213 1.352 3.075 5.046 0.000
Age -0.006 -0.026 0.015 -0.532 0.595
Standard errors: OLS
#plot
interact_plot(lm1_2022, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.1, interval = TRUE, int.type = "confidence", int.width = .95) + ylab("Ct value")
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.

D1_2022 <- interact_plot(lm1_2022, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.1, interval = TRUE, int.type = "confidence", int.width = .95) + ylab("Ct value") + theme(legend.position = "none")
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.
#verify normality of residuals
qqnorm(lm1_2022$residuals)
qqline(lm1_2022$residuals)

#save the plot:
plot_grid(A1_2022, B1_2022, C1_2022, D1_2022, labels = c('A', 'B', 'C', 'D'), align = "h")

RV seasonality plots with the Washington State genomes:

#Loading the dataframe:
season_data_RV <- read_csv("RV-dataframe_20240130.csv") 
## Rows: 1003 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Sequence, Genotype, Sex, RVSpecie, Clinica, AgeGroup
## dbl  (4): StudyPeriod, Age, RV_Ct_value, ZipCode
## date (1): CollectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
season_data_RV$CollectionMonth<- as.factor(substr(season_data_RV$CollectionDate,1,7))
season_data_RV$Genotype <- as.factor(season_data_RV$Genotype)
season_data_RV$Sex <- as.factor(season_data_RV$Sex)
season_data_RV$RVSpecie <- as.factor(season_data_RV$RVSpecie)
season_data_RV$Clinica <- as.factor(season_data_RV$Clinica)
season_data_RV$StudyPeriod <- as.factor(season_data_RV$StudyPeriod)
season_data_RV$AgeGroup <- as.factor(season_data_RV$AgeGroup)
season_data_RV$Age <- as.numeric(season_data_RV$Age)
season_data_RV$RV_Ct_value <- as.numeric(season_data_RV$RV_Ct_value)
season_data_RV$ZipCode <- as.factor(season_data_RV$ZipCode)

# ----->Bubble chart of seasonality of RV genotypes of all the species together. It is needed an Excel file containing a column for the sequence name and a column for the genotype:
condensed_season_data_RV <- ddply(season_data_RV,.(Genotype,CollectionMonth),nrow)

ggplot(condensed_season_data_RV, aes(x= factor(CollectionMonth), y=Genotype, size=V1)) + geom_point(alpha=0.5) + scale_y_discrete(limits=rev(c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105","A111" ,"B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "B107", "C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56"))) + xlab("Collection Date") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ----->Barplot of the seasonality of RV species by month of sample collection. 
n <- length(unique(season_data_RV$RVSpecie))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

A2 <- ggplot(season_data_RV, aes(x=CollectionMonth, fill=RVSpecie)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x=element_blank()) + labs(y="Proportion", fill = "Species") + scale_fill_manual(values = sample(col_vector, n))
A2

# ----->Barplot of the seasonality of RV-A genotypes by month of sample collection. 
season_data_RVA <- subset(season_data_RV, season_data_RV$RVSpecie=="A")
nA <- length(unique(season_data_RVA$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

B2 <- ggplot(season_data_RVA, aes(x=CollectionMonth, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x=element_blank()) + labs(y="Proportion", fill = "RV-A Genotype") + scale_fill_manual(values = sample(col_vector, nA), breaks = c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105", "A111"))
B2

# ----->Barplot of the seasonality of RV-B genotypes by month of sample collection. 
season_data_RVB <- subset(season_data_RV, season_data_RV$RVSpecie=="B")
nB <- length(unique(season_data_RVB$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

C2 <- ggplot(season_data_RVB, aes(x=CollectionMonth, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x=element_blank()) + labs(y="Proportion", fill = "RV-B Genotype") + scale_fill_manual(values = sample(col_vector, nB), breaks = c("B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "B107"))
C2

# ----->Barplot of the seasonality of RV-C genotypes by month of sample collection. 
season_data_RVC <- subset(season_data_RV, season_data_RV$RVSpecie=="C")
nC <- length(unique(season_data_RVC$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

D2 <- ggplot(season_data_RVC, aes(x=CollectionMonth, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x=element_blank()) + labs(y="Proportion", fill = "RV-C Genotype") + scale_fill_manual(values = sample(col_vector, nC), breaks = c("C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56")) 
D2

Assocaition of RV species and genotypes with with clinical and epidemiological characteristics:

#RV species versus individuals age
ggplot(season_data_RV, aes(x=RVSpecie, y=Age, fill=RVSpecie)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="white", alpha=0.2) + geom_jitter(color="black", size=0.4, alpha=0.9) + theme_minimal()
## Warning: Removed 477 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 477 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 477 rows containing missing values (`geom_point()`).

#RV genotypes within RV species versus individuals age
season_data_RVA <- subset(season_data_RV, subset = RVSpecie == "A")
season_data_RVB <- subset(season_data_RV, subset = RVSpecie == "B")
season_data_RVC <- subset(season_data_RV, subset = RVSpecie == "C")

ggplot(season_data_RVA, aes(x=Genotype, y=Age)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 330 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 330 rows containing non-finite values (`stat_boxplot()`).

ggplot(season_data_RVB, aes(x=Genotype, y=Age)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 54 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 54 rows containing non-finite values (`stat_boxplot()`).

ggplot(season_data_RVC, aes(x=Genotype, y=Age)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 93 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 93 rows containing non-finite values (`stat_boxplot()`).

#Age Group and Sex by RV species
df_clinica <- season_data_RV[!is.na(season_data_RV$Age),]
ggplot(df_clinica, aes(x=RVSpecie, fill=AgeGroup)) + geom_bar(position = "dodge") 

ggplot(df_clinica, aes(x=AgeGroup, fill=RVSpecie)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-65", "more65"))

ggplot(df_clinica, aes(x=RVSpecie, fill=Sex)) + geom_bar(position = "dodge")

#statistical test
  df_clinica_Sex.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$Sex))  
  g.test(df_clinica_Sex.spe)
## 
##  G-test of independence
## 
## data:  df_clinica_Sex.spe
## X-squared = 2.38, p-value = 0.3042
  df_clinica_age.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$AgeGroup))  
  g.test(df_clinica_age.spe)
## Warning in g.test(df_clinica_age.spe): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica_age.spe
## X-squared = 26.064, p-value = 0.0002166
fisher.test(df_clinica$RVSpecie, df_clinica$AgeGroup,simulate.p.value = TRUE)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  df_clinica$RVSpecie and df_clinica$AgeGroup
## p-value = 0.0009995
## alternative hypothesis: two.sided
#pairwise comparison in <5years old
AB_less5 <-matrix(c(31,148+69+10,1,42+16+6),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "B"), AgeGroup = c("less5", "above5"))) #RV-A vs RV-B
oddsratio.fisher(AB_less5)
## $data
##        AgeGroup
## RV      less5 above5 Total
##   A        31      1    32
##   B       227     64   291
##   Total   258     65   323
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate    lower    upper
##   A 1.000000       NA       NA
##   B 8.707892 1.393362 361.1631
## 
## $p.value
##    two-sided
## RV   midp.exact fisher.exact chi.square
##   A          NA           NA         NA
##   B 0.005542377  0.009158696 0.01150801
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
AC_less5 <- matrix(c(44,106+41+12,31,148+69+10),nrow = 2, ncol = 2, dimnames=list(RV= c("C", "A"), AgeGroup = c("less5", "above5"))) #RV-A vs RV-C
oddsratio.fisher(AC_less5)
## $data
##        AgeGroup
## RV      less5 above5 Total
##   C        44     31    75
##   A       159    227   386
##   Total   203    258   461
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate    lower    upper
##   C 1.000000       NA       NA
##   A 2.023234 1.191306 3.470825
## 
## $p.value
##    two-sided
## RV   midp.exact fisher.exact  chi.square
##   C          NA           NA          NA
##   A 0.005788502  0.007317769 0.005278256
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
BC_less5 <- matrix(c(44,106+41+12,1,42+16+6),nrow = 2, ncol = 2, dimnames=list(RV= c("C", "B"), AgeGroup = c("less5", "above5"))) #RV-B vs RV-C
oddsratio.fisher(BC_less5)
## $data
##        AgeGroup
## RV      less5 above5 Total
##   C        44      1    45
##   B       159     64   223
##   Total   203     65   268
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate    lower    upper
##   C  1.00000       NA       NA
##   B 17.61096 2.866409 723.9207
## 
## $p.value
##    two-sided
## RV    midp.exact fisher.exact   chi.square
##   C           NA           NA           NA
##   B 1.948796e-05 3.335026e-05 0.0001568077
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
#pairwise comparison in 5-17 years old
AB_517 <-matrix(c(69,148+31+10,16,42+1+6),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "B"), AgeGroup = c("5-17", "no5-17"))) #RV-A vs RV-B
oddsratio.fisher(AB_517)
## $data
##        AgeGroup
## RV      5-17 no5-17 Total
##   A       69     16    85
##   B      189     49   238
##   Total  258     65   323
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate   lower    upper
##   A 1.000000      NA       NA
##   B 1.117693 0.57853 2.249721
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   B  0.7407271    0.8748545  0.7275839
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
AC_517 <- matrix(c(69,148+31+10,41,106+44+12),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "C"), AgeGroup = c("5-17", "no5-17"))) #RV-A vs RV-C
oddsratio.fisher(AC_517)
## $data
##        AgeGroup
## RV      5-17 no5-17 Total
##   A       69     41   110
##   C      189    162   351
##   Total  258    203   461
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate     lower    upper
##   A 1.000000        NA       NA
##   C 1.441364 0.9099605 2.303174
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   C  0.1025923     0.123167   0.101582
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
BC_517 <- matrix(c(16,42+1+6,41,106+44+12),nrow = 2, ncol = 2, dimnames=list(RV= c("B", "C"), AgeGroup = c("5-17", "no5-17"))) #RV-B vs RV-C
oddsratio.fisher(BC_517)
## $data
##        AgeGroup
## RV      5-17 no5-17 Total
##   B       16     41    57
##   C       49    162   211
##   Total   65    203   268
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate     lower    upper
##   B 1.000000        NA       NA
##   C 1.288952 0.6195165 2.593063
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   B         NA           NA         NA
##   C  0.4511888    0.4870479  0.4486773
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
#pairwise comparison in 18-65 years old
AB_1865 <-matrix(c(148,69+31+10,42,16+1+6),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "B"), AgeGroup = c("18-65", "no18-65"))) #RV-A vs RV-B
oddsratio.fisher(AB_1865)
## $data
##        AgeGroup
## RV      18-65 no18-65 Total
##   A       148      42   190
##   B       110      23   133
##   Total   258      65   323
## 
## $measure
##    odds ratio with 95% C.I.
## RV   estimate    lower    upper
##   A 1.0000000       NA       NA
##   B 0.7374844 0.398554 1.338763
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   B  0.2929019    0.3250536   0.288412
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
AC_1865 <- matrix(c(148,69+31+10,106,41+44+12),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "C"), AgeGroup = c("18-65", "no18-65"))) #RV-A vs RV-C
oddsratio.fisher(AC_1865)
## $data
##        AgeGroup
## RV      18-65 no18-65 Total
##   A       148     106   254
##   C       110      97   207
##   Total   258     203   461
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate     lower    upper
##   A 1.000000        NA       NA
##   C 1.230651 0.8361744 1.812462
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   C  0.2721299    0.2997226  0.2699886
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
BC_1865 <- matrix(c(42,16+1+6,106,41+44+12),nrow = 2, ncol = 2, dimnames=list(RV= c("B", "C"), AgeGroup = c("18-65", "no18-65"))) #RV-B vs RV-C
oddsratio.fisher(BC_1865)
## $data
##        AgeGroup
## RV      18-65 no18-65 Total
##   B        42     106   148
##   C        23      97   120
##   Total    65     203   268
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate     lower    upper
##   B 1.000000        NA       NA
##   C 1.667863 0.9052842 3.130992
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   B         NA           NA         NA
##   C 0.08192447   0.08705064 0.08019727
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
#pairwise comparison in more65 years old
AB_more65 <-matrix(c(10,69+31+148,6,16+1+42),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "B"), AgeGroup = c("more65", "below65"))) #RV-A vs RV-B
oddsratio.fisher(AB_more65)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##        AgeGroup
## RV      more65 below65 Total
##   A         10       6    16
##   B        248      59   307
##   Total    258      65   323
## 
## $measure
##    odds ratio with 95% C.I.
## RV   estimate     lower    upper
##   A 1.0000000        NA       NA
##   B 0.3978969 0.1250806 1.387523
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   B  0.1027628    0.1034038 0.07537015
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
AC_more65 <- matrix(c(10,69+31+148,12,41+44+106),nrow = 2, ncol = 2, dimnames=list(RV= c("A", "C"), AgeGroup = c("more65", "below65"))) #RV-A vs RV-C
oddsratio.fisher(AC_more65)
## $data
##        AgeGroup
## RV      more65 below65 Total
##   A         10      12    22
##   C        248     191   439
##   Total    258     203   461
## 
## $measure
##    odds ratio with 95% C.I.
## RV   estimate     lower    upper
##   A 1.0000000        NA       NA
##   C 0.6424378 0.2430212 1.661239
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   A         NA           NA         NA
##   C  0.3202901    0.3801073  0.3088357
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
BC_more65 <- matrix(c(6,16+1+42,12,41+44+106),nrow = 2, ncol = 2, dimnames=list(RV= c("B", "C"), AgeGroup = c("more65", "below65"))) #RV-B vs RV-C
oddsratio.fisher(BC_more65)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##        AgeGroup
## RV      more65 below65 Total
##   B          6      12    18
##   C         59     191   250
##   Total     65     203   268
## 
## $measure
##    odds ratio with 95% C.I.
## RV  estimate     lower    upper
##   B 1.000000        NA       NA
##   C 1.615435 0.4760161 4.898714
## 
## $p.value
##    two-sided
## RV  midp.exact fisher.exact chi.square
##   B         NA           NA         NA
##   C  0.3661566     0.393684  0.3520965
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
#Symptoms versus RV species
df_clinica.SYM <- df_clinica[!is.na(df_clinica$Clinica),]
ggplot(df_clinica.SYM, aes(x=Clinica, fill=RVSpecie)) + geom_bar(position = "dodge")

#statistical test
  df_clinica_sym.spe <- as.matrix(table(df_clinica.SYM$RVSpecie, df_clinica.SYM$Clinica))  
  g.test(df_clinica_sym.spe)
## 
##  G-test of independence
## 
## data:  df_clinica_sym.spe
## X-squared = 0.2215, p-value = 0.8952
#Odd ratio for Symptoms and Ct<25
df_clinica.SYM$Ct25 <- ifelse(df_clinica.SYM$RV_Ct_value<25, "below25", "above25")
oddsratio.wald(table(df_clinica.SYM$Clinica, df_clinica.SYM$Ct25))
## $data
##               
##                above25 below25 Total
##   Asymptomatic      39      70   109
##   Symptomatic       88     258   346
##   Total            127     328   455
## 
## $measure
##               odds ratio with 95% C.I.
##                estimate    lower   upper
##   Asymptomatic 1.000000       NA      NA
##   Symptomatic  1.633442 1.030807 2.58839
## 
## $p.value
##               two-sided
##                midp.exact fisher.exact chi.square
##   Asymptomatic         NA           NA         NA
##   Symptomatic  0.03949042   0.03820375 0.03573582
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio.fisher(table(df_clinica.SYM$Clinica, df_clinica.SYM$Ct25))
## $data
##               
##                above25 below25 Total
##   Asymptomatic      39      70   109
##   Symptomatic       88     258   346
##   Total            127     328   455
## 
## $measure
##               odds ratio with 95% C.I.
##                estimate     lower    upper
##   Asymptomatic 1.000000        NA       NA
##   Symptomatic  1.631504 0.9987592 2.645774
## 
## $p.value
##               two-sided
##                midp.exact fisher.exact chi.square
##   Asymptomatic         NA           NA         NA
##   Symptomatic  0.03949042   0.03820375 0.03573582
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Conditional MLE & exact CI from 'fisher.test'"
#Symptoms by age group per RV species
df_clinica.SYMA <- subset(df_clinica.SYM, subset = RVSpecie == "A")
df_clinica.SYMB <- subset(df_clinica.SYM, subset = RVSpecie == "B")
df_clinica.SYMC <- subset(df_clinica.SYM, subset = RVSpecie == "C")

ggplot(df_clinica.SYMA, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-65", "more65"))

    df_clinica.SYMA_sym.age <- as.matrix(table(df_clinica.SYMA$Clinica, df_clinica.SYMA$AgeGroup))  
    g.test(df_clinica.SYMA_sym.age)
## Warning in g.test(df_clinica.SYMA_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMA_sym.age
## X-squared = 1.2136, p-value = 0.7497
ggplot(df_clinica.SYMB, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-65", "more65"))

    df_clinica.SYMB_sym.age <- as.matrix(table(df_clinica.SYMB$Clinica, df_clinica.SYMB$AgeGroup))  
    g.test(df_clinica.SYMB_sym.age)
## Warning in g.test(df_clinica.SYMB_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMB_sym.age
## X-squared = 2.9685, p-value = 0.3965
ggplot(df_clinica.SYMC, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-65", "more65"))

    df_clinica.SYMC_sym.age <- as.matrix(table(df_clinica.SYMC$Clinica, df_clinica.SYMC$AgeGroup))  
    g.test(df_clinica.SYMC_sym.age)
## Warning in g.test(df_clinica.SYMC_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMC_sym.age
## X-squared = 3.7501, p-value = 0.2897
#logistic regressions to evaluate association with geographic location and individuals age:
season_data_RV %>% group_by(ZipCode) %>%
get_summary_stats(Age, type="mean_sd") %>% print(n = 25)
## # A tibble: 24 × 5
##    ZipCode variable     n  mean    sd
##    <fct>   <fct>    <dbl> <dbl> <dbl>
##  1 98001   Age         32  30.5 19.1 
##  2 98003   Age          1  41   NA   
##  3 98006   Age          1  44   NA   
##  4 98007   Age         99  30.7 20.5 
##  5 98021   Age          1  53   NA   
##  6 98036   Age         21  25.9 20.5 
##  7 98037   Age          3  32   31.0 
##  8 98103   Age          1  52   NA   
##  9 98104   Age          7  40.9 16.3 
## 10 98105   Age          1  69   NA   
## 11 98107   Age          4  13.2  9.18
## 12 98109   Age          4  33   25.6 
## 13 98116   Age         31  29.3 18.7 
## 14 98118   Age         15  28.5 21.2 
## 15 98133   Age        106  26.2 20.3 
## 16 98134   Age         43  29   17.5 
## 17 98144   Age          1  24   NA   
## 18 98155   Age          5  46.2  9.76
## 19 98195   Age         80  28.1 21.5 
## 20 98201   Age         14  22   17.6 
## 21 98272   Age         10  21.5 23.2 
## 22 98908   Age          5   9.2 15.6 
## 23 99336   Age          4  21.5 13.2 
## 24 <NA>    Age         37  12.3 17.4
#plotting
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$ZipCode),]

ggplot(season_data_RV_clean, aes(x=ZipCode, y=Age)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

#if leaving zip code with more than 5 cases reported:
season_data_RV.5 <- season_data_RV %>%
  group_by(ZipCode) %>%
  filter(n() > 5) %>%
  na.omit(season_data_RV.5$ZipCode)

ggplot(season_data_RV.5, aes(x=ZipCode, y=Age)) +  geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))

#multivariate multiple regression model to study Ct, symptoms and RV SPECIES (using data frame of sequenced samples):
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$Clinica),]

lm_reg1 <- lm(RV_Ct_value~Clinica+RVSpecie, season_data_RV_clean)
Anova(lm_reg1)
## Anova Table (Type II tests)
## 
## Response: RV_Ct_value
##           Sum Sq  Df F value  Pr(>F)  
## Clinica     84.4   1  4.8519 0.02812 *
## RVSpecie    38.1   2  1.0961 0.33507  
## Residuals 7847.3 451                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm_reg1)

#identify the formula:
summ(lm_reg1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
Observations 455
Dependent variable RV_Ct_value
Type OLS linear regression
F(3,451) 2.302
R² 0.015
Adj. R² 0.009
Est. 2.5% 97.5% t val. p
(Intercept) 23.522 22.661 24.383 53.692 0.000
ClinicaSymptomatic -1.009 -1.910 -0.109 -2.203 0.028
RVSpecieB 0.813 -0.413 2.040 1.304 0.193
RVSpecieC 0.439 -0.397 1.275 1.033 0.302
Standard errors: OLS
#verify normality of residuals
qqnorm(lm_reg1$residuals)
qqline(lm_reg1$residuals)

#Ct value and the genotypes (with more than 5 cases detected):
season_data_RV_CT.5 <- season_data_RV %>% group_by(Genotype) %>% filter(n() > 5) %>% filter(RV_Ct_value!="NA")
season_data_RV_CT.5$Genotype<- as.factor(season_data_RV_CT.5$Genotype)
#Kruskal test
res.gen.ct.kruskal <- kruskal_test(season_data_RV_CT.5, season_data_RV_CT.5$RV_Ct_value ~ season_data_RV_CT.5$Genotype)
res.gen.ct.kruskal
## # A tibble: 36 × 7
##    Genotype .y.                                n statistic    df        p method
##  * <fct>    <chr>                          <int>     <dbl> <int>    <dbl> <chr> 
##  1 A101     season_data_RV_CT.5$RV_Ct_val…   100      127.    35 2.06e-12 Krusk…
##  2 A11      season_data_RV_CT.5$RV_Ct_val…     6      127.    35 2.06e-12 Krusk…
##  3 A1B      season_data_RV_CT.5$RV_Ct_val…   206      127.    35 2.06e-12 Krusk…
##  4 A2       season_data_RV_CT.5$RV_Ct_val…     6      127.    35 2.06e-12 Krusk…
##  5 A20      season_data_RV_CT.5$RV_Ct_val…    14      127.    35 2.06e-12 Krusk…
##  6 A22      season_data_RV_CT.5$RV_Ct_val…     9      127.    35 2.06e-12 Krusk…
##  7 A23      season_data_RV_CT.5$RV_Ct_val…    12      127.    35 2.06e-12 Krusk…
##  8 A24      season_data_RV_CT.5$RV_Ct_val…    11      127.    35 2.06e-12 Krusk…
##  9 A25      season_data_RV_CT.5$RV_Ct_val…    24      127.    35 2.06e-12 Krusk…
## 10 A28      season_data_RV_CT.5$RV_Ct_val…     7      127.    35 2.06e-12 Krusk…
## # ℹ 26 more rows
#calculate effect size
res.gen.ct.effsize <- season_data_RV %>% kruskal_effsize(RV_Ct_value ~ Genotype)
res.gen.ct.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.             n effsize method  magnitude
## * <chr>       <int>   <dbl> <chr>   <ord>    
## 1 RV_Ct_value  1003   0.159 eta2[H] large
#All genotypes versus Ct value
ggboxplot(season_data_RV, x = "Genotype", y = "RV_Ct_value") + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))

#Genotypes with more than 5 cases detected, versus Ct value
ggboxplot(season_data_RV_CT.5, x = "Genotype", y = "RV_Ct_value") + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))